set.seed(3)
suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(rgl)
  library(clusterExperiment)
  library(RColorBrewer)
  library(ggplot2)
  library(dplyr)
})
colby <- function(values, g=4){
  if(is(values, "character")){
    cols <- as.numeric(as.factor(values))
    return(cols)
  }
  if(is(values, "factor")){
    ncl <- nlevels(values)
    if(ncl > 9){
          colpal <- c(RColorBrewer::brewer.pal(9, 'Set1'), wesanderson::wes_palette("Darjeeling1", n=ncl-9, type="continuous"))
    } else {
       colpal <- RColorBrewer::brewer.pal(9, 'Set1')
    }
    cols <- colpal[as.numeric(values)]
    return(cols)
  }
  if(is(values, "numeric")){
    pal <- wesanderson::wes_palette("Zissou1", n=g, type="continuous")
    gg <- Hmisc::cut2(values, g=g)
    if(nlevels(gg) < g){
      nl <- nlevels(gg)
      if(nl == 2) pal <- pal[c(1,g)]
    }
    cols <- pal[gg]
    return(cols)
  }
}
cols <- brewer.pal(9,'Set1')
colsBig <- clusterExperiment::massivePalette
plotDRTimePoint <- function(dr, timePoint, cols, legendCex=1, legendPos="topright", ...){
  plot(dr, col=cols[timePoint], pch=16, cex=1/2, xlab="UMAP1", ylab="UMAP2", bty='l', ...)
  legend(legendPos, legend=levels(timePoint), col=cols[1:nlevels(timePoint)], pch=16, bty='n', cex=legendCex)
}
plotDRExpression <- function(dr, counts){
  df = data.frame(dim1=dr[,1], dim2=dr[,2], y=counts)
  df %>% ggplot(aes(x=dim1, y=dim2, col=y)) + 
    geom_point(size=2/3) + scale_color_viridis_c(alpha=.6) + theme_classic()
}

Import data

dataDir <- "../data"
cl <- readRDS(paste0(dataDir,"/cl_OE.rds"))
load(paste0(dataDir,"/regenK5_scone_none,fq,ruv_k=1,no_bio,batch_rsec_adjP_mergecutoff_0.01_20190609_085359.Rda"))
subset <- readRDS(paste0(dataDir,"/subset_index.rds"))
batch <- droplevels(colData(cl2)$batch[subset])
counts <- assays(cl2)$counts[,subset]
colnames(counts) <- colData(cl2)$samples[subset]
timePoint <- as.character(colData(cl2)$batch[subset])
timePoint <- factor(unlist(lapply(strsplit(timePoint, split="_"), "[[", 1)),
                    levels=c("24HPI", "48HPI", "96HPI", "7DPI", "14DPI", "Mix"))
rm(cl2) ; gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells   7416384  396.1   14481390  773.4         NA   9988747  533.5
## Vcells 337225457 2572.9  926475190 7068.5     102400 770606619 5879.3

Remove doublets

Doublets were identified by Boying Gong using DoubletFinder on each individual sample.

# Remove Doublets
doublets <- read.csv(paste0(dataDir,"/doublet_prediction_sample.csv"), stringsAsFactors = FALSE)
hist(doublets$doubletFinderScore)

doubID <- which(doublets$doubletFinderPred)
doubletCells <- doublets$cell[doubID] #271 doublets (~1%)

counts_noDoubs <- counts[,-doubID]
timePoint_noDoubs <- timePoint[-doubID]
cl_noDoubs <- cl[-doubID]

Dimensionality reduction

library(uwot)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(irlba)

### top 100 PCs
set.seed(15)
fVar <- rowVars(counts_noDoubs)
fMean <- rowMeans(counts_noDoubs)
highVarFeatures <- order(fVar, decreasing=TRUE)[1:500]
pcIk <- irlba(log1p(counts_noDoubs[highVarFeatures,]), nv=100)

# UMAP on principal directions
umIk50 <- uwot::umap(pcIk$v, n_components=3)
rownames(umIk50) <- colnames(counts_noDoubs)
plot(umIk50,  col=alpha(brewer.pal(6,'Set1')[timePoint_noDoubs],.4), pch=16, cex=1/2,
     xlab="UMAP1", ylab="UMAP2", bty='l')
legend("bottomright",levels(timePoint),pch=16,col=brewer.pal(6,'Set1'), bty='n')

plot(umIk50,  col=alpha(brewer.pal(9,'Set1')[factor(cl_noDoubs)],.4), pch=16, cex=1/2,
     xlab="UMAP1", ylab="UMAP2", bty='l')
oldCelltype <- c("SUS", "IN3", "rHBC", "IN2", "GBC", "Neuron", "IN1", "HBC", "MV")
oldCelltype <- factor(oldCelltype, levels=oldCelltype)
legend("bottomright",levels(oldCelltype),pch=16,col=brewer.pal(9,'Set1'), bty='n')

plot3d(umIk50, aspect = 'iso', col=brewer.pal(9,'Set1')[factor(cl_noDoubs)], alpha=.3)
plot3d(umIk50, aspect = 'iso', col=brewer.pal(6,'Set1')[timePoint_noDoubs], alpha=.3)

Interpreting (some of) the islands

# 24h and 48h cells at mature neuron stage are true neurons => some neurons slipped through
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Omp",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Scg2",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Neurod1",]), alpha=.3)

# SUS
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Cyp2g1",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Cyp1a2",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Vmo1",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Muc2",]), alpha=.3)

# HBC
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Krt5",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Krt14",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Krt15",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Trp63",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Gpr87",]), alpha=.3)

# GBC
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Sprr1a",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Ascl1",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Prim1",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Tk1",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Plk1",]), alpha=.3)

# MV
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Ascl3",]), alpha=.3)
plot3d(umIk50, aspect = 'iso', col=colby(counts_noDoubs["Trpm5",]), alpha=.3)

Remove chronologically asyncrhonous cells (bridge, mature neuron 24/48h cells)

Bridge cells

plot3d(umIk50, aspect = 'iso', 
       col=cols[timePoint_noDoubs], 
       alpha=.6)


bridgeRegionID <- (umIk50[,1] < -3.3 & umIk50[,1] > -4.5) & (umIk50[,2] < 0)
plot3d(umIk50, aspect = 'iso', 
       col=colby(factor(bridgeRegionID), g=2), alpha=.6)
bridgeAllCells <- which(bridgeRegionID)
bridgeID <- bridgeAllCells[timePoint_noDoubs[bridgeAllCells] %in% levels(timePoint_noDoubs)[1:3]]
# plot cells to be removed
bridgeLogical <- vector(length=nrow(umIk50))
bridgeLogical[bridgeID] <- TRUE
plot3d(umIk50, aspect = 'iso', 
       col=colby(factor(bridgeLogical), g=2), alpha=.4)

bridgeCellNames <- rownames(umIk50)[bridgeID]
rm(bridgeLogical, bridgeAllCells) 

Mature neuron 24/48h

plot3d(umIk50, aspect = 'iso', 
       col=cols[timePoint_noDoubs], 
       alpha=.6)

mnCellID <- which(umIk50[,1] < -6.5)
mnCellID <- mnCellID[timePoint_noDoubs[mnCellID] %in% levels(timePoint_noDoubs)[1:2]]
mnLogical <- vector(length=nrow(umIk50))
mnLogical[mnCellID] <- TRUE
plot3d(umIk50, aspect = 'iso', 
       col=colby(factor(mnLogical), g=2), alpha=.4)

rm(mnLogical)

Remove these

rmAsync <- c(bridgeID, mnCellID)

umIk50_async <- umIk50[-rmAsync,]
counts_noDoubs_async <- counts_noDoubs[,-rmAsync]
timePoint_noDoubs_async <- timePoint_noDoubs[-rmAsync]
cl_noDoubs_async <- cl_noDoubs[-rmAsync]

rm(counts, counts_noDoubs) ; gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used   (Mb)
## Ncells   7610549  406.5   14481390  773.4         NA   14481390  773.4
## Vcells 333060472 2541.1 1111850228 8482.8     102400 1056817977 8062.9

Dimensionality reduction

Using David Brann’s gene selection and PCA objects

dataDavid <- "../data/objectsFromDavid"
davidGenes <- read.table(paste0(dataDavid, "/genes_used.txt"), stringsAsFactors = FALSE)[,1]
davidPC <- readRDS(paste0(dataDavid, "/pcs.rds"))
davidAnnotation <- read.csv(paste0(dataDavid, "/Datta_annotation_HBC_lineage_newPCs.csv"))
clDavid <- davidAnnotation$leiden_0_6_caleb20PCs
mergedCl <- vector(length=nrow(davidAnnotation))
mergedCl[clDavid %in% c(13,9,6,1)] <- "HBC*"
mergedCl[clDavid %in% c(3,0,5,15)] <- "HBC"
mergedCl[clDavid %in% c(11)] <- "GBC"
mergedCl[clDavid %in% c(11)] <- "GBC"
mergedCl[clDavid %in% c(10,12,4)] <- "iOSN"
mergedCl[clDavid %in% c(8)] <- "mOSN"
mergedCl[clDavid %in% c(2)] <- "Sus"
mergedCl[clDavid %in% c(14)] <- "MV"
mergedCl[clDavid %in% c(7)] <- "RE HBC"
mergedCl[clDavid %in% c(16)] <- "RE"
mergedCl <- factor(mergedCl)

umDavid1 <- uwot::umap(davidPC, n_neighbors=20, min_dist=0.5)

plotDRTimePoint(dr=umDavid1, timePoint=factor(clDavid), cols=colsBig, legendCex=2/3)

plotDRTimePoint(dr=umDavid1, timePoint=factor(mergedCl), cols=colsBig, legendCex=4/5, legendPos="bottomright")

plotDRTimePoint(dr=umDavid1, timePoint=timePoint_noDoubs_async, cols=brewer.pal(9, 'Set1'), legendPos="bottomright")

plotDRTimePoint(dr=umDavid1, timePoint=factor(cl_noDoubs_async), cols=brewer.pal(9, 'Set1'), legendCex=4/5, legendPos="bottomleft", main="clusterExperiment labels")

Reproduce their pipeline using more PCs

normDavid <- sweep(counts_noDoubs_async, 2, colSums(counts_noDoubs_async), "/") * median(colSums(counts_noDoubs_async))

## genes that have been removed
davidGenes[!davidGenes %in% rownames(normDavid)]
##  [1] "4930523C07Rik" "4631405K08Rik" "2900093K20Rik" "1700013H16Rik"
##  [5] "Sprr2j-ps"     "1110017D15Rik" "1500011B03Rik" "RP24-492L15.6"
##  [9] "8430408G22Rik" "2200002D01Rik" "Hbb-bs"        "RP23-4H17.3"  
## [13] "1500009L16Rik" "1810011O10Rik" "1700012B09Rik" "2810417H13Rik"
## [17] "5730403I07Rik" "6330403K07Rik" "1700016K19Rik" "Krtap3-2"     
## [21] "Krtap2-4"      "Krtap17-1"     "Olfr465-ps1"   "1700001L19Rik"
## [25] "1700086L19Rik" "2300005B03Rik" "2610037D02Rik" "4732456N10Rik"
## [29] "1700097N02Rik" "H2-K1"         "H2-Ab1"        "H2-Aa"        
## [33] "H2-Eb1"        "H2-D1"         "H2-T23"        "2900055J20Rik"
## [37] "2610016A17Rik" "4430402I18Rik" "mt-Nd1"        "mt-Nd2"       
## [41] "mt-Co1"        "mt-Co2"        "mt-Atp6"       "mt-Co3"       
## [45] "mt-Nd3"        "mt-Nd4"        "mt-Nd5"        "mt-Cytb"      
## [49] "CreER\""
davidGenes2 <- intersect(davidGenes, rownames(normDavid))

## genes that have been removed
hvGenes <- order(rowVars(normDavid), decreasing=TRUE)[1:length(davidGenes2)]

pc100David_hvg <- irlba(log1p(normDavid[hvGenes,]), nv=100)

umDavid_hvg <- uwot::umap(pc100David_hvg$v, n_neighbors=20, min_dist=0.5)

plotDRTimePoint(dr=umDavid_hvg, timePoint=factor(mergedCl), cols=colsBig, legendCex=4/5, legendPos="topleft", main="Datta cluster labels")

plotDRTimePoint(dr=umDavid_hvg, timePoint=factor(timePoint_noDoubs_async), cols=brewer.pal(9, 'Set1'), legendCex=4/5, legendPos="topleft", main="Time point")

plotDRTimePoint(dr=umDavid_hvg, timePoint=factor(cl_noDoubs_async), cols=brewer.pal(9, 'Set1'), legendCex=4/5, legendPos="topleft", main="clusterExperiment labels")

Check expression of respiratory genes and other markers

# Respiratory cells
plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Adh7",])) + ggtitle("Adh7")

plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Reg3g",])) + ggtitle("Reg3g")

# HBC
plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Krt5",])) + ggtitle("Krt5")

plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Krt14",])) + ggtitle("Krt14")

plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Trp63",])) + ggtitle("Trp63")

# HBC*
plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Sprr1a",])) + ggtitle("Sprr1a")

plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Krtdap",])) + ggtitle("Krtdap")

# GBC
plotDRExpression(umDavid_hvg, log1p(counts_noDoubs_async["Ascl1",])) + ggtitle("Ascl1")

Remove respiratory and microvillous cells

counts_noResp_noMV <- counts_noDoubs_async[,!mergedCl %in% c("RE", "RE HBC", "MV")]

Save files

# counts
saveRDS(counts_noResp_noMV, file="../data/finalTrajectory/counts_noResp_noMV.rds")
# timePoints
timePoint_noResp_noMV <- timePoint_noDoubs_async[!mergedCl %in% c("RE", "RE HBC", "MV")]
saveRDS(timePoint_noResp_noMV, file="../data/finalTrajectory/timePoint_noResp_noMV.rds")
# Datta labels
mergedCl_noResp_noMV <- mergedCl[!mergedCl %in% c("RE", "RE HBC", "MV")]
saveRDS(mergedCl_noResp_noMV, file="../data/finalTrajectory/dattaCl_noResp_noMV.rds")
# clusterExperiment labels
cl_noResp_noMV <- timePoint_noDoubs_async[!mergedCl %in% c("RE", "RE HBC", "MV")]
saveRDS(cl_noResp_noMV, file="../data/finalTrajectory/rsecCl.rds")
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] irlba_2.3.3                 uwot_0.1.5                 
##  [3] Matrix_1.2-18               dplyr_0.8.4                
##  [5] ggplot2_3.3.2               RColorBrewer_1.1-2         
##  [7] clusterExperiment_2.6.1     bigmemory_4.5.36           
##  [9] rgl_0.100.30                SingleCellExperiment_1.8.0 
## [11] SummarizedExperiment_1.16.1 DelayedArray_0.12.2        
## [13] BiocParallel_1.20.1         matrixStats_0.56.0         
## [15] Biobase_2.46.0              GenomicRanges_1.38.0       
## [17] GenomeInfoDb_1.22.0         IRanges_2.20.2             
## [19] S4Vectors_0.24.3            BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.5         uuid_0.1-2              Hmisc_4.3-1            
##   [4] NMF_0.21.0              plyr_1.8.5              lazyeval_0.2.2         
##   [7] splines_3.6.2           crosstalk_1.0.0         rncl_0.8.3             
##  [10] gridBase_0.4-7          digest_0.6.23           foreach_1.4.7          
##  [13] htmltools_0.4.0         wesanderson_0.3.6       checkmate_2.0.0        
##  [16] magrittr_1.5            memoise_1.1.0           cluster_2.1.0          
##  [19] doParallel_1.0.15       limma_3.42.1            annotate_1.64.0        
##  [22] RcppParallel_4.4.4      prettyunits_1.1.1       jpeg_0.1-8.1           
##  [25] colorspace_1.4-1        blob_1.2.1              xfun_0.12              
##  [28] crayon_1.3.4            RCurl_1.98-1.1          jsonlite_1.6.1         
##  [31] bigmemory.sri_0.1.3     genefilter_1.68.0       phylobase_0.8.6        
##  [34] survival_3.1-8          iterators_1.0.12        ape_5.3                
##  [37] glue_1.3.1              registry_0.5-1          gtable_0.3.0           
##  [40] zlibbioc_1.32.0         XVector_0.26.0          webshot_0.5.2          
##  [43] kernlab_0.9-29          Rhdf5lib_1.8.0          HDF5Array_1.14.1       
##  [46] scales_1.1.0            DBI_1.1.0               edgeR_3.28.0           
##  [49] rngtools_1.5            bibtex_0.4.2.2          miniUI_0.1.1.1         
##  [52] Rcpp_1.0.3              viridisLite_0.3.0       xtable_1.8-4           
##  [55] progress_1.2.2          htmlTable_1.13.3        foreign_0.8-72         
##  [58] bit_1.1-15.2            Formula_1.2-3           htmlwidgets_1.5.1      
##  [61] httr_1.4.1              acepack_1.4.1           pkgconfig_2.0.3        
##  [64] XML_3.99-0.3            farver_2.0.3            nnet_7.3-12            
##  [67] locfit_1.5-9.1          labeling_0.3            howmany_0.3-1          
##  [70] tidyselect_1.0.0        rlang_0.4.4.9000        manipulateWidget_0.10.0
##  [73] softImpute_1.4          reshape2_1.4.3          later_1.0.0            
##  [76] AnnotationDbi_1.48.0    munsell_0.5.0           tools_3.6.2            
##  [79] RSQLite_2.2.0           ade4_1.7-13             evaluate_0.14          
##  [82] stringr_1.4.0           fastmap_1.0.1           yaml_2.2.1             
##  [85] knitr_1.28              bit64_0.9-7             purrr_0.3.3            
##  [88] nlme_3.1-142            mime_0.9                xml2_1.3.2             
##  [91] rstudioapi_0.11         compiler_3.6.2          png_0.1-7              
##  [94] tibble_2.1.3            RNeXML_2.4.2            stringi_1.4.5          
##  [97] RSpectra_0.16-0         lattice_0.20-38         vctrs_0.2.2            
## [100] pillar_1.4.3            lifecycle_0.1.0         RcppAnnoy_0.0.14       
## [103] zinbwave_1.11.6         data.table_1.12.8       bitops_1.0-6           
## [106] httpuv_1.5.2            R6_2.4.1                latticeExtra_0.6-29    
## [109] promises_1.1.0          gridExtra_2.3           codetools_0.2-16       
## [112] MASS_7.3-51.4           assertthat_0.2.1        rhdf5_2.30.1           
## [115] pkgmaker_0.31           withr_2.1.2             GenomeInfoDbData_1.2.2 
## [118] locfdr_1.1-8            hms_0.5.3               grid_3.6.2             
## [121] rpart_4.1-15            tidyr_1.0.2             rmarkdown_2.1          
## [124] shiny_1.4.0             base64enc_0.1-3